1.1. Notebook2#

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the California housing data.
housing = pd.read_csv("./housing.csv") # https://github.com/ageron/data/tree/main/housing

# Temporary income category used only to stratify the train/test split so the
# test set's income distribution mirrors the full dataset's.
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0, 1.5, 3.0, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(housing, test_size=0.20, stratify=housing["income_cat"], random_state=42)

# Drop the helper column. Reassignment instead of inplace=True: in-place
# mutation in a loop is a hidden-state hazard when cells are re-run.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)

# Separate predictors from the training target.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 2
      1 import numpy as np
----> 2 import pandas as pd
      3 from sklearn.model_selection import train_test_split
      5 housing = pd.read_csv("./housing.csv") # https://github.com/ageron/data/tree/main/housing

ModuleNotFoundError: No module named 'pandas'
# Simple Feature Engineering via attribute combinations
from ydata_profiling import ProfileReport

# Derived ratio features. NOTE: these mutate `housing` in place, so every
# later cell sees the extra columns once this cell has run.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["people_per_house"] = housing["population"] / housing["households"]
# Automated EDA report; the bare expression on the last line renders it inline.
profile = ProfileReport(housing, title="Pandas Profiling Report")
profile

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

numerical_columns = housing.select_dtypes(include=['number'])

# Compute the correlation matrix for numerical columns
correlation_matrix = numerical_columns.corr()


plt.figure(figsize=(10, 8))  # Set the figure size
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix Heatmap")
<Figure size 1000x800 with 0 Axes>
<Axes: >
Text(0.5, 1.0, 'Correlation Matrix Heatmap')
../../_images/09f7e2a386e1d8fa9d1a2d67b61e197d6ff416b186f1db8502e1e42996e5e554.png
# Data Preprocessing -- fix missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

# Median statistics can only be learned from numeric columns.
housing_num = housing.select_dtypes(include=[np.number])
imputer.fit(housing_num)
# Before: total_bedrooms has a lower count (missing values).
housing_num.describe()
# After transform: every column has the same count (holes filled with medians).
pd.DataFrame(imputer.transform(housing_num), columns=housing_num.columns).describe()
SimpleImputer(strategy='median')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income
count 16512.000000 16512.000000 16512.000000 16512.000000 16344.000000 16512.000000 16512.000000 16512.000000
mean -119.573125 35.637746 28.577156 2639.402798 538.949094 1425.513929 499.990189 3.870428
std 2.000624 2.133294 12.585738 2185.287466 423.862079 1094.795467 382.865787 1.891936
min -124.350000 32.550000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900
25% -121.800000 33.930000 18.000000 1447.000000 296.000000 787.000000 279.000000 2.562500
50% -118.510000 34.260000 29.000000 2125.000000 434.000000 1167.000000 408.000000 3.538500
75% -118.010000 37.720000 37.000000 3154.000000 645.000000 1726.000000 603.000000 4.750000
max -114.490000 41.950000 52.000000 39320.000000 6210.000000 16305.000000 5358.000000 15.000100
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income
count 16512.000000 16512.000000 16512.000000 16512.000000 16512.000000 16512.000000 16512.000000 16512.000000
mean -119.573125 35.637746 28.577156 2639.402798 537.881298 1425.513929 499.990189 3.870428
std 2.000624 2.133294 12.585738 2185.287466 421.831667 1094.795467 382.865787 1.891936
min -124.350000 32.550000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900
25% -121.800000 33.930000 18.000000 1447.000000 297.000000 787.000000 279.000000 2.562500
50% -118.510000 34.260000 29.000000 2125.000000 434.000000 1167.000000 408.000000 3.538500
75% -118.010000 37.720000 37.000000 3154.000000 642.000000 1726.000000 603.000000 4.750000
max -114.490000 41.950000 52.000000 39320.000000 6210.000000 16305.000000 5358.000000 15.000100
# Data Preprocessing -- Handle text attributes
from sklearn.preprocessing import OneHotEncoder

### one hot encoder can handle unknown values by ignoring them! Look up documentation

housing_cat = housing.select_dtypes(include=['object']) # PandasV2
cat_encoder = OneHotEncoder()
# fit_transform returns a SciPy sparse matrix; toarray() densifies it for display.
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
# Learned category list, one array per encoded column.
cat_encoder.categories_
array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])
[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]
# Data Preprocessing -- Feature Scaling
from sklearn.preprocessing import MinMaxScaler  # fix: was imported twice in this cell
from sklearn.preprocessing import StandardScaler

# Scale features to [-1, 1]; sensitive to outliers since it uses min/max.
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))
housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)

# Standardization (zero mean, unit variance) is less sensitive to outliers.
std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(housing_num)
# Data Preprocessing -- Custom Transformers

from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import KMeans

# FunctionTransformer wraps a plain function; inverse_func enables inverse_transform.
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp) # inverse_func option
# Log-compress the heavy-tailed population feature.
log_transformer.transform(housing[["population"]])
population
13096 7.362645
14973 6.501290
3785 6.331502
14689 7.520235
20507 7.555905
... ...
14207 6.843750
13105 7.257708
19301 7.942362
19121 7.452982
19888 6.525030

16512 rows × 1 columns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Minimal re-implementation of StandardScaler, demonstrating the
    scikit-learn estimator API (fit / transform, trailing-underscore
    attributes for learned state)."""

    def __init__(self, with_mean=True):  # no *args or **kwargs!
        self.with_mean = with_mean

    def fit(self, X, y=None):  # y is required even though we don't use it
        """Learn per-column mean and standard deviation from X."""
        X = check_array(X)  # checks that X is an array with finite float values
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()
        return self  # always return self!

    def transform(self, X):
        """Center (optionally) and scale X using the fitted statistics."""
        check_is_fitted(self)  # looks for learned attributes (with trailing _)
        X = check_array(X)
        assert self.n_features_in_ == X.shape[1]
        centered = X - self.mean_ if self.with_mean else X
        return centered / self.scale_
from sklearn.cluster import KMeans
# fix: transform() below uses rbf_kernel, which was only imported in a later
# cell — on a fresh Restart-&-Run-All this definition would raise NameError.
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transform features into RBF similarities to KMeans cluster centers."""

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X (optionally sample-weighted) and keep the fitted KMeans."""
        self.kmeans_ = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        # RBF kernel: exp(-gamma * ||x - center||^2) per cluster center.
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans


# Define the ClusterSimilarity class
# Define the ClusterSimilarity class
# NOTE(review): this redefines ClusterSimilarity from an earlier cell (the later
# definition silently shadows the earlier one); the only addition here is an
# explicit fit_transform. Consider keeping a single definition.
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transform features into RBF similarities to KMeans cluster centers."""

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X (optionally sample-weighted) and keep the fitted KMeans."""
        self.kmeans_ = KMeans(self.n_clusters, n_init=10, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        # RBF kernel: exp(-gamma * ||x - center||^2) per cluster center.
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def fit_transform(self, X, y=None, sample_weight=None):
        # Explicit override so sample_weight is forwarded to fit().
        self.fit(X, y, sample_weight)
        return self.transform(X)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

# Generate some sample data
X, y = make_blobs(n_samples=10, centers=3, n_features=2, random_state=42)

# Instantiate the ClusterSimilarity transformer
cluster_sim = ClusterSimilarity(n_clusters=3, gamma=0.5, random_state=42)

# Fit and transform the data
similarity_matrix = cluster_sim.fit_transform(X)

# Print the original data and the resulting similarity matrix
# (rows = samples, columns = RBF similarity to each of the 3 cluster centers).
print("Original Data (X):")
print(X)
print("\nCluster Centers:")
print(cluster_sim.kmeans_.cluster_centers_)
print("\nSimilarity Matrix (fit_transform output):")
print(similarity_matrix)
Original Data (X):
[[-5.41397842 -7.10588589]
 [-7.42400992 -6.769187  ]
 [ 3.62704772  2.28741702]
 [-6.81209899 -8.30485778]
 [-2.26723535  7.10100588]
 [-2.97867201  9.55684617]
 [-0.92998481  9.78172086]
 [ 2.914961    1.41088215]
 [ 3.73185476  0.56086598]
 [-2.97261532  8.54855637]]

Cluster Centers:
[[-6.55002911 -7.39331023]
 [-2.28712687  8.74703232]
 [ 3.42462116  1.41972172]]

Similarity Matrix (fit_transform output):
[[5.03278314e-01 2.01625537e-57 1.78942476e-33]
 [5.61756878e-01 9.79826866e-59 7.61772432e-41]
 [1.44263700e-43 2.20758932e-17 6.72377177e-01]
 [6.37755829e-01 2.59768844e-68 5.12843842e-44]
 [2.49800830e-50 2.57973257e-01 9.04006383e-15]
 [6.95471207e-66 5.67215161e-01 5.23079534e-24]
 [1.22170360e-71 2.33118709e-01 4.99726363e-20]
 [5.18478201e-37 2.73497668e-18 8.78169364e-01]
 [2.01903751e-37 3.81418698e-23 6.59671807e-01]
 [1.08267553e-58 7.75192762e-01 1.19631119e-20]]
# Fit 10 geographic clusters on lat/long, weighting districts by house value,
# then compute each district's RBF similarity to every cluster center.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
similarities = cluster_simil.fit_transform(housing[["latitude", "longitude"]],
                                           sample_weight=housing_labels)

# Renamed copy just for nicer axis/legend labels in the plot.
housing_renamed = housing.rename(columns={
    "latitude": "Latitude", "longitude": "Longitude",
    "population": "Population",
    "median_house_value": "Median house value (ᴜsᴅ)"})
housing_renamed["Max cluster similarity"] = similarities.max(axis=1)

# Scatter map: point size ~ population, color ~ closeness to nearest cluster.
housing_renamed.plot(kind="scatter", x="Longitude", y="Latitude", grid=True,
                     s=housing_renamed["Population"] / 100, label="Population",
                     c="Max cluster similarity",
                     cmap="jet", colorbar=True,
                     legend=True, sharex=False, figsize=(10, 7))
# Overlay cluster centers; note centers are (lat, long), plot wants (x=long, y=lat).
plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],
         cluster_simil.kmeans_.cluster_centers_[:, 0],
         linestyle="", color="black", marker="X", markersize=20,
         label="Cluster centers")
plt.legend(loc="upper right")
plt.show()
<Axes: xlabel='Longitude', ylabel='Latitude'>
[<matplotlib.lines.Line2D at 0x7fcc74ae80e0>]
<matplotlib.legend.Legend at 0x7fcc74864d40>
../../_images/2d986a1689faa82f0543f6b7409a59c6a188333dbe6821fc22c8945464902934.png
# Second training district's similarity to each of the 10 cluster centers.
similarities[1,:]
array([5.82488504e-14, 9.90046143e-01, 1.62296800e-10, 3.84360842e-02,
       4.56888397e-04, 1.31326258e-13, 1.10891612e-01, 2.99695991e-26,
       6.32121447e-01, 1.49980122e-09])
from sklearn.pipeline import Pipeline, make_pipeline

# Chain median imputation with standardization. An explicit Pipeline lets us
# choose the step names ("impute", "standardize") ourselves.
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

num_pipeline
num_pipeline.named_steps

# Verify the pipeline filled every hole in total_bedrooms (expect 0 nulls).
housing_num_prepared = pd.DataFrame(num_pipeline.fit_transform(housing_num),
                                    columns=num_pipeline.get_feature_names_out())
housing_num_prepared["total_bedrooms"].isnull().sum()
Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                ('standardize', StandardScaler())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
{'impute': SimpleImputer(strategy='median'), 'standardize': StandardScaler()}
np.int64(0)
# Same pipeline via make_pipeline: step names are auto-derived from the
# class names ("simpleimputer", "standardscaler").
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
from sklearn import set_config
# NOTE(review): set_config is imported but never used in this cell —
# presumably intended for set_config(display="diagram"); confirm or remove.


num_pipeline
num_pipeline.named_steps
Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
{'simpleimputer': SimpleImputer(strategy='median'),
 'standardscaler': StandardScaler()}
# Explicit
from sklearn.compose import ColumnTransformer

# Hand-written column lists: the eight numeric attributes and the one
# categorical attribute.
num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]
cat_attribs = ["ocean_proximity"]

# Numeric branch: median imputation + standardization.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Categorical branch: most-frequent imputation + one-hot encoding; unknown
# categories at transform time are ignored instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Route each column subset through its own pipeline.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])
from sklearn.compose import make_column_selector, make_column_transformer


num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Same split as above, but columns are selected by dtype instead of by name,
# so new numeric/object columns are picked up automatically.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)
housing_prepared = preprocessing.fit_transform(housing)
# Wrap the NumPy output back into a DataFrame using the transformer-generated
# feature names and the original row index.
housing_prepared_fr = pd.DataFrame(
    housing_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=housing.index)
housing_prepared_fr.head(2)
pipeline-1__longitude pipeline-1__latitude pipeline-1__housing_median_age pipeline-1__total_rooms pipeline-1__total_bedrooms pipeline-1__population pipeline-1__households pipeline-1__median_income pipeline-2__ocean_proximity_<1H OCEAN pipeline-2__ocean_proximity_INLAND pipeline-2__ocean_proximity_ISLAND pipeline-2__ocean_proximity_NEAR BAY pipeline-2__ocean_proximity_NEAR OCEAN
13096 -1.423037 1.013606 1.861119 0.311912 1.368167 0.137460 1.394812 -0.936491 0.0 0.0 0.0 1.0 0.0
14973 0.596394 -0.702103 0.907630 -0.308620 -0.435925 -0.693771 -0.373485 1.171942 1.0 0.0 0.0 0.0 0.0
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer



def column_ratio(X):
    """Return the ratio of the first column to the second, shape (n, 1)."""
    # Range slices (rather than fancy indexing) keep the result 2-D.
    return X[:, :1] / X[:, 1:2]

def ratio_name(function_transformer, feature_names_in):
    """Output feature name for the column_ratio FunctionTransformer: always a
    single combined feature, regardless of the input names."""
    return ["ratio"]  # feature names out

def ratio_pipeline():
    """Pipeline factory: impute (median), compute column ratio, standardize.

    A fresh pipeline per call, so each ColumnTransformer entry below gets
    its own independently-fitted copy.
    """
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())

# Heavy-tailed features: impute, log-transform, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())


# Geographic features -> 10 RBF cluster-similarity columns.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
# Fallback for any numeric column not claimed by a transformer below.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

1.1.1. Code Explanation#

1.1.1.1. 1. column_ratio Function#

def column_ratio(X):
    return X[:, [0]] / X[:, [1]]
  • This function takes a 2D array X as input and computes the ratio of the first column (X[:, [0]]) to the second column (X[:, [1]]).

  • It assumes that X has at least two columns, and the operation is performed element-wise for all rows.


1.1.1.2. 2. ratio_name Function#

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out
  • This function is used to define the output feature name(s) for the FunctionTransformer that applies the column_ratio function.

  • It always returns a single feature name, "ratio", regardless of the input feature names.


1.1.1.3. 3. ratio_pipeline Function#

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())
  • This function creates a pipeline for preprocessing numerical data by:

    1. Imputing missing values using the median (SimpleImputer(strategy="median")).

    2. Transforming the data by applying the column_ratio function using FunctionTransformer.

    3. Standardizing the data using StandardScaler to ensure the output has a mean of 0 and a standard deviation of 1.


1.1.1.4. 4. log_pipeline#

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())
  • This pipeline preprocesses numerical data by:

    1. Imputing missing values with the median.

    2. Applying the natural logarithm transformation (np.log) to the data.

    3. Standardizing the transformed data.


1.1.1.5. 5. cluster_simil#

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
  • This creates a ClusterSimilarity object, which transforms data into cluster similarity features.

  • It clusters the data into 10 clusters and computes similarity scores for each cluster using a radial basis function (RBF) kernel with gamma=1.0.


1.1.1.6. 6. default_num_pipeline#

default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
  • This pipeline handles numerical data by imputing missing values with the median and standardizing the data.


1.1.1.7. 7. preprocessing#

preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age
  • Purpose: This ColumnTransformer applies different preprocessing pipelines to different subsets of features in the dataset.

  • Transformations:

    1. bedrooms: Applies the ratio_pipeline to compute the ratio of total_bedrooms to total_rooms.

    2. rooms_per_house: Applies the ratio_pipeline to compute the ratio of total_rooms to households.

    3. people_per_house: Applies the ratio_pipeline to compute the ratio of population to households.

    4. log: Applies the log_pipeline to transform several numerical features using the natural logarithm.

    5. geo: Applies the ClusterSimilarity transformation to latitude and longitude.

    6. cat: Applies a categorical pipeline (cat_pipeline) to all categorical features (selected using make_column_selector(dtype_include=object)).

  • remainder Parameter:

    • The remainder parameter specifies how to handle columns that are not explicitly listed in the ColumnTransformer.

    • Here, it is set to default_num_pipeline, meaning that any remaining columns (in this case, housing_median_age) will be processed using the default_num_pipeline (imputation + standardization).


1.1.2. Role of the remainder Parameter#

The remainder parameter in ColumnTransformer determines what happens to columns that are not explicitly mentioned in the transformations. It can take the following values:

  1. "drop" (default): Drops any columns not specified in the transformations.

  2. "passthrough": Leaves the remaining columns unchanged and includes them in the output.

  3. A pipeline or transformer: Applies the specified pipeline or transformer to the remaining columns.

In this code:

remainder=default_num_pipeline
  • The default_num_pipeline (imputation + standardization) is applied to the remaining column, housing_median_age.


1.1.3. Summary#

  • The code defines a preprocessing pipeline for a dataset, applying different transformations to different feature groups.

  • The remainder parameter ensures that any columns not explicitly mentioned (like housing_median_age) are still processed using a default pipeline.

  • The ratio_pipeline computes ratios between pairs of columns, while other pipelines handle logarithmic transformations, clustering, and categorical data.

# Sanity check: standardized rooms-per-household ratio, one column out.
ratio_pipeline().fit_transform(housing[["total_rooms", "households"]])
array([[-0.86602737],
       [ 0.0245495 ],
       [-0.04119332],
       ...,
       [-0.10998748],
       [-0.36093745],
       [-1.32397227]])
housing_prepared = preprocessing.fit_transform(housing)
# 24 features: 3 ratios + 5 logs + 10 cluster similarities + 5 one-hot + 1 remainder.
housing_prepared.shape
(16512, 24)
# Generated names are prefixed with the ColumnTransformer entry name ("log__", "geo__", ...).
preprocessing.get_feature_names_out()
array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)
from sklearn.linear_model import LinearRegression

# Full model pipeline: preprocessing followed by linear regression.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)
# Predictions on the training set itself — a sanity check, not an evaluation.
housing_predictions = lin_reg.predict(housing)
housing_predictions[:5].round(-2)  # rounded to the nearest hundred
housing_labels.iloc[:5].values
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fc...
                                                   'median_income']),
                                                 ('geo',
                                                  ClusterSimilarity(random_state=42),
                                                  ['latitude', 'longitude']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fcc54992720>)])),
                ('linearregression', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
array([242800., 375900., 127500.,  99400., 324600.])
array([458300., 483800., 101700.,  96100., 361800.])
# All hyperparameters of all steps, addressable via step__param names (useful for grid search).
lin_reg.get_params()
{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                SimpleImputer(strategy='median')),
                                               ('standardscaler',
                                                StandardScaler())]),
                     transformers=[('bedrooms',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('functiontransformer',
                                                     FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                                                         func=<function column_ratio...
                                    ['total_bedrooms', 'total_rooms', 'population',
                                     'households', 'median_income']),
                                   ('geo', ClusterSimilarity(random_state=42),
                                    ['latitude', 'longitude']),
                                   ('cat',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7fcc54992720>)])),
  ('linearregression', LinearRegression())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                              SimpleImputer(strategy='median')),
                                             ('standardscaler',
                                              StandardScaler())]),
                   transformers=[('bedrooms',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='median')),
                                                  ('functiontransformer',
                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                                                       func=<function column_ratio...
                                  ['total_bedrooms', 'total_rooms', 'population',
                                   'households', 'median_income']),
                                 ('geo', ClusterSimilarity(random_state=42),
                                  ['latitude', 'longitude']),
                                 ('cat',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fcc54992720>)]),
 'linearregression': LinearRegression(),
 'columntransformer__force_int_remainder_cols': True,
 'columntransformer__n_jobs': None,
 'columntransformer__remainder__memory': None,
 'columntransformer__remainder__steps': [('simpleimputer',
   SimpleImputer(strategy='median')),
  ('standardscaler', StandardScaler())],
 'columntransformer__remainder__verbose': False,
 'columntransformer__remainder__simpleimputer': SimpleImputer(strategy='median'),
 'columntransformer__remainder__standardscaler': StandardScaler(),
 'columntransformer__remainder__simpleimputer__add_indicator': False,
 'columntransformer__remainder__simpleimputer__copy': True,
 'columntransformer__remainder__simpleimputer__fill_value': None,
 'columntransformer__remainder__simpleimputer__keep_empty_features': False,
 'columntransformer__remainder__simpleimputer__missing_values': nan,
 'columntransformer__remainder__simpleimputer__strategy': 'median',
 'columntransformer__remainder__standardscaler__copy': True,
 'columntransformer__remainder__standardscaler__with_mean': True,
 'columntransformer__remainder__standardscaler__with_std': True,
 'columntransformer__remainder': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('standardscaler', StandardScaler())]),
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('bedrooms',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                   ('functiontransformer',
                    FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                        func=<function column_ratio at 0x7fcc549b6840>)),
                   ('standardscaler', StandardScaler())]),
   ['total_bedrooms', 'total_rooms']),
  ('rooms_per_house',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                   ('functiontransformer',
                    FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                        func=<function column_ratio at 0x7fcc549b6840>)),
                   ('standardscaler', StandardScaler())]),
   ['total_rooms', 'households']),
  ('people_per_house',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                   ('functiontransformer',
                    FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                        func=<function column_ratio at 0x7fcc549b6840>)),
                   ('standardscaler', StandardScaler())]),
   ['population', 'households']),
  ('log',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                   ('functiontransformer',
                    FunctionTransformer(feature_names_out='one-to-one',
                                        func=<ufunc 'log'>)),
                   ('standardscaler', StandardScaler())]),
   ['total_bedrooms',
    'total_rooms',
    'population',
    'households',
    'median_income']),
  ('geo', ClusterSimilarity(random_state=42), ['latitude', 'longitude']),
  ('cat',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                   ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))]),
   <sklearn.compose._column_transformer.make_column_selector at 0x7fcc54992720>)],
 'columntransformer__verbose': False,
 'columntransformer__verbose_feature_names_out': True,
 'columntransformer__bedrooms': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('functiontransformer',
                  FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                      func=<function column_ratio at 0x7fcc549b6840>)),
                 ('standardscaler', StandardScaler())]),
 'columntransformer__rooms_per_house': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('functiontransformer',
                  FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                      func=<function column_ratio at 0x7fcc549b6840>)),
                 ('standardscaler', StandardScaler())]),
 'columntransformer__people_per_house': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('functiontransformer',
                  FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                                      func=<function column_ratio at 0x7fcc549b6840>)),
                 ('standardscaler', StandardScaler())]),
 'columntransformer__log': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('functiontransformer',
                  FunctionTransformer(feature_names_out='one-to-one',
                                      func=<ufunc 'log'>)),
                 ('standardscaler', StandardScaler())]),
 'columntransformer__geo': ClusterSimilarity(random_state=42),
 'columntransformer__cat': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))]),
 'columntransformer__bedrooms__memory': None,
 'columntransformer__bedrooms__steps': [('simpleimputer',
   SimpleImputer(strategy='median')),
  ('functiontransformer',
   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                       func=<function column_ratio at 0x7fcc549b6840>)),
  ('standardscaler', StandardScaler())],
 'columntransformer__bedrooms__verbose': False,
 'columntransformer__bedrooms__simpleimputer': SimpleImputer(strategy='median'),
 'columntransformer__bedrooms__functiontransformer': FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                     func=<function column_ratio at 0x7fcc549b6840>),
 'columntransformer__bedrooms__standardscaler': StandardScaler(),
 'columntransformer__bedrooms__simpleimputer__add_indicator': False,
 'columntransformer__bedrooms__simpleimputer__copy': True,
 'columntransformer__bedrooms__simpleimputer__fill_value': None,
 'columntransformer__bedrooms__simpleimputer__keep_empty_features': False,
 'columntransformer__bedrooms__simpleimputer__missing_values': nan,
 'columntransformer__bedrooms__simpleimputer__strategy': 'median',
 'columntransformer__bedrooms__functiontransformer__accept_sparse': False,
 'columntransformer__bedrooms__functiontransformer__check_inverse': True,
 'columntransformer__bedrooms__functiontransformer__feature_names_out': <function __main__.ratio_name(function_transformer, feature_names_in)>,
 'columntransformer__bedrooms__functiontransformer__func': <function __main__.column_ratio(X)>,
 'columntransformer__bedrooms__functiontransformer__inv_kw_args': None,
 'columntransformer__bedrooms__functiontransformer__inverse_func': None,
 'columntransformer__bedrooms__functiontransformer__kw_args': None,
 'columntransformer__bedrooms__functiontransformer__validate': False,
 'columntransformer__bedrooms__standardscaler__copy': True,
 'columntransformer__bedrooms__standardscaler__with_mean': True,
 'columntransformer__bedrooms__standardscaler__with_std': True,
 'columntransformer__rooms_per_house__memory': None,
 'columntransformer__rooms_per_house__steps': [('simpleimputer',
   SimpleImputer(strategy='median')),
  ('functiontransformer',
   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                       func=<function column_ratio at 0x7fcc549b6840>)),
  ('standardscaler', StandardScaler())],
 'columntransformer__rooms_per_house__verbose': False,
 'columntransformer__rooms_per_house__simpleimputer': SimpleImputer(strategy='median'),
 'columntransformer__rooms_per_house__functiontransformer': FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                     func=<function column_ratio at 0x7fcc549b6840>),
 'columntransformer__rooms_per_house__standardscaler': StandardScaler(),
 'columntransformer__rooms_per_house__simpleimputer__add_indicator': False,
 'columntransformer__rooms_per_house__simpleimputer__copy': True,
 'columntransformer__rooms_per_house__simpleimputer__fill_value': None,
 'columntransformer__rooms_per_house__simpleimputer__keep_empty_features': False,
 'columntransformer__rooms_per_house__simpleimputer__missing_values': nan,
 'columntransformer__rooms_per_house__simpleimputer__strategy': 'median',
 'columntransformer__rooms_per_house__functiontransformer__accept_sparse': False,
 'columntransformer__rooms_per_house__functiontransformer__check_inverse': True,
 'columntransformer__rooms_per_house__functiontransformer__feature_names_out': <function __main__.ratio_name(function_transformer, feature_names_in)>,
 'columntransformer__rooms_per_house__functiontransformer__func': <function __main__.column_ratio(X)>,
 'columntransformer__rooms_per_house__functiontransformer__inv_kw_args': None,
 'columntransformer__rooms_per_house__functiontransformer__inverse_func': None,
 'columntransformer__rooms_per_house__functiontransformer__kw_args': None,
 'columntransformer__rooms_per_house__functiontransformer__validate': False,
 'columntransformer__rooms_per_house__standardscaler__copy': True,
 'columntransformer__rooms_per_house__standardscaler__with_mean': True,
 'columntransformer__rooms_per_house__standardscaler__with_std': True,
 'columntransformer__people_per_house__memory': None,
 'columntransformer__people_per_house__steps': [('simpleimputer',
   SimpleImputer(strategy='median')),
  ('functiontransformer',
   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                       func=<function column_ratio at 0x7fcc549b6840>)),
  ('standardscaler', StandardScaler())],
 'columntransformer__people_per_house__verbose': False,
 'columntransformer__people_per_house__simpleimputer': SimpleImputer(strategy='median'),
 'columntransformer__people_per_house__functiontransformer': FunctionTransformer(feature_names_out=<function ratio_name at 0x7fcc549b54e0>,
                     func=<function column_ratio at 0x7fcc549b6840>),
 'columntransformer__people_per_house__standardscaler': StandardScaler(),
 'columntransformer__people_per_house__simpleimputer__add_indicator': False,
 'columntransformer__people_per_house__simpleimputer__copy': True,
 'columntransformer__people_per_house__simpleimputer__fill_value': None,
 'columntransformer__people_per_house__simpleimputer__keep_empty_features': False,
 'columntransformer__people_per_house__simpleimputer__missing_values': nan,
 'columntransformer__people_per_house__simpleimputer__strategy': 'median',
 'columntransformer__people_per_house__functiontransformer__accept_sparse': False,
 'columntransformer__people_per_house__functiontransformer__check_inverse': True,
 'columntransformer__people_per_house__functiontransformer__feature_names_out': <function __main__.ratio_name(function_transformer, feature_names_in)>,
 'columntransformer__people_per_house__functiontransformer__func': <function __main__.column_ratio(X)>,
 'columntransformer__people_per_house__functiontransformer__inv_kw_args': None,
 'columntransformer__people_per_house__functiontransformer__inverse_func': None,
 'columntransformer__people_per_house__functiontransformer__kw_args': None,
 'columntransformer__people_per_house__functiontransformer__validate': False,
 'columntransformer__people_per_house__standardscaler__copy': True,
 'columntransformer__people_per_house__standardscaler__with_mean': True,
 'columntransformer__people_per_house__standardscaler__with_std': True,
 'columntransformer__log__memory': None,
 'columntransformer__log__steps': [('simpleimputer',
   SimpleImputer(strategy='median')),
  ('functiontransformer',
   FunctionTransformer(feature_names_out='one-to-one', func=<ufunc 'log'>)),
  ('standardscaler', StandardScaler())],
 'columntransformer__log__verbose': False,
 'columntransformer__log__simpleimputer': SimpleImputer(strategy='median'),
 'columntransformer__log__functiontransformer': FunctionTransformer(feature_names_out='one-to-one', func=<ufunc 'log'>),
 'columntransformer__log__standardscaler': StandardScaler(),
 'columntransformer__log__simpleimputer__add_indicator': False,
 'columntransformer__log__simpleimputer__copy': True,
 'columntransformer__log__simpleimputer__fill_value': None,
 'columntransformer__log__simpleimputer__keep_empty_features': False,
 'columntransformer__log__simpleimputer__missing_values': nan,
 'columntransformer__log__simpleimputer__strategy': 'median',
 'columntransformer__log__functiontransformer__accept_sparse': False,
 'columntransformer__log__functiontransformer__check_inverse': True,
 'columntransformer__log__functiontransformer__feature_names_out': 'one-to-one',
 'columntransformer__log__functiontransformer__func': <ufunc 'log'>,
 'columntransformer__log__functiontransformer__inv_kw_args': None,
 'columntransformer__log__functiontransformer__inverse_func': None,
 'columntransformer__log__functiontransformer__kw_args': None,
 'columntransformer__log__functiontransformer__validate': False,
 'columntransformer__log__standardscaler__copy': True,
 'columntransformer__log__standardscaler__with_mean': True,
 'columntransformer__log__standardscaler__with_std': True,
 'columntransformer__geo__gamma': 1.0,
 'columntransformer__geo__n_clusters': 10,
 'columntransformer__geo__random_state': 42,
 'columntransformer__cat__memory': None,
 'columntransformer__cat__steps': [('simpleimputer',
   SimpleImputer(strategy='most_frequent')),
  ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))],
 'columntransformer__cat__verbose': False,
 'columntransformer__cat__simpleimputer': SimpleImputer(strategy='most_frequent'),
 'columntransformer__cat__onehotencoder': OneHotEncoder(handle_unknown='ignore'),
 'columntransformer__cat__simpleimputer__add_indicator': False,
 'columntransformer__cat__simpleimputer__copy': True,
 'columntransformer__cat__simpleimputer__fill_value': None,
 'columntransformer__cat__simpleimputer__keep_empty_features': False,
 'columntransformer__cat__simpleimputer__missing_values': nan,
 'columntransformer__cat__simpleimputer__strategy': 'most_frequent',
 'columntransformer__cat__onehotencoder__categories': 'auto',
 'columntransformer__cat__onehotencoder__drop': None,
 'columntransformer__cat__onehotencoder__dtype': numpy.float64,
 'columntransformer__cat__onehotencoder__feature_name_combiner': 'concat',
 'columntransformer__cat__onehotencoder__handle_unknown': 'ignore',
 'columntransformer__cat__onehotencoder__max_categories': None,
 'columntransformer__cat__onehotencoder__min_frequency': None,
 'columntransformer__cat__onehotencoder__sparse_output': True,
 'linearregression__copy_X': True,
 'linearregression__fit_intercept': True,
 'linearregression__n_jobs': None,
 'linearregression__positive': False}
# Compare the first five training predictions (rounded to the nearest
# hundred dollars) against their true labels, as a signed percent error.
predicted = housing_predictions[:5].round(-2)
actual = housing_labels.iloc[:5].values
error_ratios = predicted / actual - 1
print(", ".join(f"{100 * ratio:.1f}%" for ratio in error_ratios))
-47.0%, -22.3%, 25.4%, 3.4%, -10.3%
from sklearn.metrics import root_mean_squared_error

# RMSE of the linear model on the *training* set — an optimistic estimate,
# since the model is evaluated on the same data it was fit on.
lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse
np.float64(68647.95686706669)
!pip install dagshub mlflow
Requirement already satisfied: dagshub in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (0.3.45)
Requirement already satisfied: mlflow in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (2.18.0)
Requirement already satisfied: PyYAML>=5 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (6.0.2)
Requirement already satisfied: appdirs>=1.4.4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (1.4.4)
Requirement already satisfied: click>=8.0.4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (8.1.7)
Requirement already satisfied: httpx>=0.23.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (0.27.2)
Requirement already satisfied: GitPython>=3.1.29 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (3.1.43)
Requirement already satisfied: rich>=13.1.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (13.9.4)
Requirement already satisfied: dacite~=1.6.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (1.6.0)
Requirement already satisfied: tenacity>=8.2.2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (9.0.0)
Requirement already satisfied: gql[requests] in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (3.5.0)
Requirement already satisfied: dataclasses-json in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (0.6.7)
Requirement already satisfied: pandas in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (2.2.3)
Requirement already satisfied: treelib>=1.6.4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (1.7.0)
Requirement already satisfied: pathvalidate>=3.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (3.2.1)
Requirement already satisfied: python-dateutil in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (2.9.0.post0)
Requirement already satisfied: boto3 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (1.35.76)
Requirement already satisfied: dagshub-annotation-converter>=0.1.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub) (0.1.2)
Requirement already satisfied: mlflow-skinny==2.18.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (2.18.0)
Requirement already satisfied: Flask<4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (3.1.0)
Requirement already satisfied: alembic!=1.10.0,<2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (1.14.0)
Requirement already satisfied: docker<8,>=4.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (7.1.0)
Requirement already satisfied: graphene<4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (3.4.3)
Requirement already satisfied: markdown<4,>=3.3 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (3.7)
Requirement already satisfied: matplotlib<4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (3.9.2)
Requirement already satisfied: numpy<3 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (2.0.2)
Requirement already satisfied: pyarrow<19,>=4.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (18.1.0)
Requirement already satisfied: scikit-learn<2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (1.5.1)
Requirement already satisfied: scipy<2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (1.13.1)
Requirement already satisfied: sqlalchemy<3,>=1.4.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (2.0.32)
Requirement already satisfied: Jinja2<4,>=2.11 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (3.1.4)
Requirement already satisfied: gunicorn<24 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow) (23.0.0)
Requirement already satisfied: cachetools<6,>=5.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (5.5.0)
Requirement already satisfied: cloudpickle<4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (3.1.0)
Requirement already satisfied: databricks-sdk<1,>=0.20.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (0.38.0)
Requirement already satisfied: importlib-metadata!=4.7.0,<9,>=3.7.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (8.4.0)
Requirement already satisfied: opentelemetry-api<3,>=1.9.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (1.28.2)
Requirement already satisfied: opentelemetry-sdk<3,>=1.9.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (1.28.2)
Requirement already satisfied: packaging<25 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (24.1)
Requirement already satisfied: protobuf<6,>=3.12.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (5.29.1)
Requirement already satisfied: requests<3,>=2.17.3 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (2.32.3)
Requirement already satisfied: sqlparse<1,>=0.4.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from mlflow-skinny==2.18.0->mlflow) (0.5.2)
Requirement already satisfied: Mako in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from alembic!=1.10.0,<2->mlflow) (1.3.7)
Requirement already satisfied: typing-extensions>=4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from alembic!=1.10.0,<2->mlflow) (4.12.2)
Requirement already satisfied: lxml in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub-annotation-converter>=0.1.0->dagshub) (5.3.0)
Requirement already satisfied: pillow in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub-annotation-converter>=0.1.0->dagshub) (10.4.0)
Requirement already satisfied: pydantic>=2.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dagshub-annotation-converter>=0.1.0->dagshub) (2.10.3)
Requirement already satisfied: urllib3>=1.26.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from docker<8,>=4.0.0->mlflow) (2.2.2)
Requirement already satisfied: Werkzeug>=3.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from Flask<4->mlflow) (3.1.3)
Requirement already satisfied: itsdangerous>=2.2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from Flask<4->mlflow) (2.2.0)
Requirement already satisfied: blinker>=1.9 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from Flask<4->mlflow) (1.9.0)
Requirement already satisfied: gitdb<5,>=4.0.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from GitPython>=3.1.29->dagshub) (4.0.11)
Requirement already satisfied: graphql-core<3.3,>=3.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from graphene<4->mlflow) (3.2.5)
Requirement already satisfied: graphql-relay<3.3,>=3.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from graphene<4->mlflow) (3.2.0)
Requirement already satisfied: anyio in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpx>=0.23.0->dagshub) (4.4.0)
Requirement already satisfied: certifi in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpx>=0.23.0->dagshub) (2024.7.4)
Requirement already satisfied: httpcore==1.* in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpx>=0.23.0->dagshub) (1.0.5)
Requirement already satisfied: idna in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpx>=0.23.0->dagshub) (3.8)
Requirement already satisfied: sniffio in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpx>=0.23.0->dagshub) (1.3.1)
Requirement already satisfied: h11<0.15,>=0.13 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from httpcore==1.*->httpx>=0.23.0->dagshub) (0.14.0)
Requirement already satisfied: MarkupSafe>=2.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from Jinja2<4,>=2.11->mlflow) (2.1.5)
Requirement already satisfied: contourpy>=1.0.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from matplotlib<4->mlflow) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from matplotlib<4->mlflow) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from matplotlib<4->mlflow) (4.53.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from matplotlib<4->mlflow) (1.4.5)
Requirement already satisfied: pyparsing>=2.3.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from matplotlib<4->mlflow) (3.1.4)
Requirement already satisfied: pytz>=2020.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from pandas->dagshub) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from pandas->dagshub) (2024.1)
Requirement already satisfied: six>=1.5 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from python-dateutil->dagshub) (1.16.0)
Requirement already satisfied: markdown-it-py>=2.2.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from rich>=13.1.0->dagshub) (3.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from rich>=13.1.0->dagshub) (2.18.0)
Requirement already satisfied: joblib>=1.2.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from scikit-learn<2->mlflow) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from scikit-learn<2->mlflow) (3.5.0)
Requirement already satisfied: greenlet!=0.4.17 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from sqlalchemy<3,>=1.4.0->mlflow) (3.0.3)
Requirement already satisfied: botocore<1.36.0,>=1.35.76 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from boto3->dagshub) (1.35.76)
Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from boto3->dagshub) (1.0.1)
Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from boto3->dagshub) (0.10.4)
Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dataclasses-json->dagshub) (3.23.1)
Requirement already satisfied: typing-inspect<1,>=0.4.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from dataclasses-json->dagshub) (0.9.0)
Requirement already satisfied: yarl<2.0,>=1.6 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from gql[requests]->dagshub) (1.18.3)
Requirement already satisfied: backoff<3.0,>=1.11.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from gql[requests]->dagshub) (2.2.1)
Requirement already satisfied: requests-toolbelt<2,>=1.0.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from gql[requests]->dagshub) (1.0.0)
Requirement already satisfied: google-auth~=2.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from databricks-sdk<1,>=0.20.0->mlflow-skinny==2.18.0->mlflow) (2.36.0)
Requirement already satisfied: smmap<6,>=3.0.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from gitdb<5,>=4.0.1->GitPython>=3.1.29->dagshub) (5.0.1)
Requirement already satisfied: zipp>=0.5 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from importlib-metadata!=4.7.0,<9,>=3.7.0->mlflow-skinny==2.18.0->mlflow) (3.20.1)
Requirement already satisfied: mdurl~=0.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=13.1.0->dagshub) (0.1.2)
Requirement already satisfied: deprecated>=1.2.6 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from opentelemetry-api<3,>=1.9.0->mlflow-skinny==2.18.0->mlflow) (1.2.15)
Requirement already satisfied: opentelemetry-semantic-conventions==0.49b2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from opentelemetry-sdk<3,>=1.9.0->mlflow-skinny==2.18.0->mlflow) (0.49b2)
Requirement already satisfied: annotated-types>=0.6.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from pydantic>=2.0.0->dagshub-annotation-converter>=0.1.0->dagshub) (0.7.0)
Requirement already satisfied: pydantic-core==2.27.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from pydantic>=2.0.0->dagshub-annotation-converter>=0.1.0->dagshub) (2.27.1)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from requests<3,>=2.17.3->mlflow-skinny==2.18.0->mlflow) (3.3.2)
Requirement already satisfied: mypy-extensions>=0.3.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json->dagshub) (1.0.0)
Requirement already satisfied: multidict>=4.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from yarl<2.0,>=1.6->gql[requests]->dagshub) (6.1.0)
Requirement already satisfied: propcache>=0.2.0 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from yarl<2.0,>=1.6->gql[requests]->dagshub) (0.2.1)
Requirement already satisfied: wrapt<2,>=1.10 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from deprecated>=1.2.6->opentelemetry-api<3,>=1.9.0->mlflow-skinny==2.18.0->mlflow) (1.17.0)
Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.18.0->mlflow) (0.4.1)
Requirement already satisfied: rsa<5,>=3.1.4 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.18.0->mlflow) (4.9)
Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from pyasn1-modules>=0.2.1->google-auth~=2.0->databricks-sdk<1,>=0.20.0->mlflow-skinny==2.18.0->mlflow) (0.6.1)
import os
from getpass import getpass

# MLflow tracking server hosted on DagsHub for this project.
MLFLOW_TRACKING_URI = "https://dagshub.com/mkzia/fall2024_housing_model.mlflow"

# SECURITY FIX: the DagsHub access token was previously hardcoded here in
# plaintext (and is now in the notebook's history — it should be revoked).
# Read credentials from the environment instead, prompting only if absent.
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'mkzia')
if 'MLFLOW_TRACKING_PASSWORD' not in os.environ:
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub token: ")


import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Create (or reuse) the MLflow experiment that groups this notebook's runs
mlflow.set_experiment("fall24_house_pricing")

score = lin_rmse      # training-set RMSE computed in the earlier cell
params = {"test": 1}  # placeholder params for this baseline run
with mlflow.start_run():
      # Log the hyperparameters
      mlflow.log_params(params)

      # Log metrics
      mlflow.log_metric("RMSE", score)
      # Infer the model signature (input/output schema) from training data
      signature = infer_signature(housing, lin_reg.predict(housing))

      # Log and register the fitted pipeline as "LinearRegression"
      model_info = mlflow.sklearn.log_model(
          sk_model=lin_reg,
          artifact_path="housing_model",
          signature=signature,
          input_example=housing,
          registered_model_name="LinearRegression",
      )
<Experiment: artifact_location='mlflow-artifacts:/05b162d4f80c4138b6ea264a8866334a', creation_time=1733441612548, experiment_id='0', last_update_time=1733441612548, lifecycle_stage='active', name='fall24_house_pricing', tags={}>
Successfully registered model 'LinearRegression'.
2024/12/05 18:34:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LinearRegression, version 1
Created version '1' of model 'LinearRegression'.
🏃 View run mercurial-dog-170 at: https://dagshub.com/mkzia/fall2024_housing_model.mlflow/#/experiments/0/runs/a68e5c0518674d1f99feb77f71c57ba7
🧪 View experiment at: https://dagshub.com/mkzia/fall2024_housing_model.mlflow/#/experiments/0
from sklearn.tree import DecisionTreeRegressor

# Same preprocessing pipeline as before, but with a decision tree regressor.
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(housing, housing_labels)
housing_predictions = tree_reg.predict(housing)
# Training-set RMSE — a fully grown tree memorizes the training data,
# so expect this to be (misleadingly) near zero; see cross-validation below.
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)
tree_rmse
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                              SimpleImputer(strategy='median')),
                                                             ('standardscaler',
                                                              StandardScaler())]),
                                   transformers=[('bedrooms',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('functiontransformer',
                                                                   FunctionTransformer(feature_names_out=<function ratio_name at 0x7fc...
                                                  ClusterSimilarity(random_state=42),
                                                  ['latitude', 'longitude']),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fcc54992720>)])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
np.float64(0.0)
from sklearn.model_selection import cross_val_score

# cross_val_score follows scikit-learn's "higher is better" convention, so
# error metrics such as RMSE are reported negated; flip the sign to get the
# actual RMSE for each of the 10 folds.
neg_scores = cross_val_score(tree_reg, housing, housing_labels,
                             scoring="neg_root_mean_squared_error", cv=10)
tree_rmses = -neg_scores

tree_rmses.mean()
pd.Series(tree_rmses).describe()
np.float64(67153.31827313449)
count       10.000000
mean     67153.318273
std       1963.580924
min      63925.253106
25%      66083.277180
50%      66795.829871
75%      68074.018403
max      70664.635833
dtype: float64
tree_reg[1].get_params()
{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}
import os
from getpass import getpass

# DagsHub-hosted MLflow tracking server for this notebook's experiments.
MLFLOW_TRACKING_URI = "https://dagshub.com/mkzia/fall2024_housing_model.mlflow"
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'mkzia')
# SECURITY FIX: never hardcode an access token in a notebook — it gets
# committed and leaked. Read it from the environment; prompt only if unset.
if 'MLFLOW_TRACKING_PASSWORD' not in os.environ:
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub MLflow token: ")


import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Create a new MLflow Experiment
mlflow.set_experiment("fall24_house_pricing")

# Log the decision tree's CV score, its hyperparameters, and the fitted
# pipeline itself (registered so it can be versioned and served).
score = tree_rmses.mean()
params = tree_reg[1].get_params()
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log metrics
    mlflow.log_metric("RMSE", score)
    # Infer the model signature from training inputs and predictions
    signature = infer_signature(housing, tree_reg.predict(housing))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=tree_reg,
        artifact_path="housing_model",
        signature=signature,
        input_example=housing,
        registered_model_name="DecisionTreeRegressor",
    )
<Experiment: artifact_location='mlflow-artifacts:/05b162d4f80c4138b6ea264a8866334a', creation_time=1733441612548, experiment_id='0', last_update_time=1733441612548, lifecycle_stage='active', name='fall24_house_pricing', tags={}>
Successfully registered model 'DecisionTreeRegressor'.
2024/12/05 18:42:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 1
Created version '1' of model 'DecisionTreeRegressor'.
🏃 View run powerful-lamb-609 at: https://dagshub.com/mkzia/fall2024_housing_model.mlflow/#/experiments/0/runs/49cc66617a91443eb5b583be22e5313c
🧪 View experiment at: https://dagshub.com/mkzia/fall2024_housing_model.mlflow/#/experiments/0
# Cross-validate the linear model the same way for a fair comparison with
# the decision tree (scores are negated RMSE, hence the leading minus).
lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,
                              scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()
count       10.000000
mean     69847.923224
std       4078.407329
min      65659.761079
25%      68088.799156
50%      68697.591463
75%      69800.966364
max      80685.254832
dtype: float64
from sklearn.ensemble import RandomForestRegressor

# Random forest on top of the same preprocessing pipeline as the other models.
# (Cross-validation of this model is skipped here because it is slow:
#  forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,
#                                  scoring="neg_root_mean_squared_error", cv=10))
forest_reg = make_pipeline(
    preprocessing,
    RandomForestRegressor(random_state=42),
)
forest_reg.fit(housing, housing_labels)
housing_predictions = forest_reg.predict(housing)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 3
      1 forest_reg.fit(housing, housing_labels)
      2 housing_predictions = forest_reg.predict(housing)
----> 3 forest_rmse = root_mean_squared_error(housing_labels, housing_predictions)
      4 forest_rmse

NameError: name 'root_mean_squared_error' is not defined
# root_mean_squared_error is available in scikit-learn >= 1.4.
from sklearn.metrics import root_mean_squared_error
# Training-set RMSE of the forest — optimistic; see CV/test-set numbers later.
forest_rmse = root_mean_squared_error(housing_labels, housing_predictions)
forest_rmse
np.float64(17547.52124624957)
from sklearn.model_selection import GridSearchCV

full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])
# Parameter names use scikit-learn's <step>__<param> routing; "geo" is the
# step inside the preprocessing ColumnTransformer (the ClusterSimilarity
# transformer over latitude/longitude, per the pipeline repr above).
param_grid = [
    {'preprocessing__geo__n_clusters': [5, 8, 10],
     'random_forest__max_features': [4, 6, 8]},
    {'preprocessing__geo__n_clusters': [10, 15],
     'random_forest__max_features': [6, 8, 10]},
]
# 3-fold CV over both grids; scoring is negated RMSE (higher is better).
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(housing, housing_labels)
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                                                     SimpleImputer(strategy='median')),
                                                                                    ('standardscaler',
                                                                                     StandardScaler())]),
                                                          transformers=[('bedrooms',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('functiontransformer',
                                                                                          FunctionTransformer(feature_names_out=<f...
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fcc54992720>)])),
                                       ('random_forest',
                                        RandomForestRegressor(random_state=42))]),
             param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],
                          'random_forest__max_features': [4, 6, 8]},
                         {'preprocessing__geo__n_clusters': [10, 15],
                          'random_forest__max_features': [6, 8, 10]}],
             scoring='neg_root_mean_squared_error')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
grid_search.best_params_
# best_score_ is negated RMSE ("higher is better"); negate to report RMSE.
-grid_search.best_score_
{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}
np.float64(43952.588192716525)
# Summarize the grid-search results, best (lowest RMSE) configurations first.
# Avoid `inplace=True` (no performance benefit, hides data lineage) — build
# the frame in one chained expression instead.
cv_res = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# extra code – these few lines of code just make the DataFrame look nicer
cv_res = cv_res[["param_preprocessing__geo__n_clusters",
                 "param_random_forest__max_features", "split0_test_score",
                 "split1_test_score", "split2_test_score", "mean_test_score"]]
score_cols = ["split0", "split1", "split2", "mean_test_rmse"]
cv_res.columns = ["n_clusters", "max_features"] + score_cols
# Scores are negated RMSE; flip the sign and round to readable integers.
cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)

cv_res.head()
n_clusters max_features split0 split1 split2 mean_test_rmse
12 15 6 43536 43753 44569 43953
13 15 8 44084 44205 44863 44384
14 15 10 44368 44496 45200 44688
7 10 6 44251 44628 45857 44912
9 10 6 44251 44628 45857 44912
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from xgboost.sklearn import XGBRegressor

# FIX: this cell previously referenced `train` / `test`, which are never
# defined in the notebook (NameError on Restart & Run All). Use the
# stratified splits created at the top of the notebook instead.
y_train = strat_train_set['median_house_value']
X_train = strat_train_set.drop('median_house_value', axis=1)

y_test = strat_test_set['median_house_value']
X_test = strat_test_set.drop('median_house_value', axis=1)

# One candidate model per key, all sharing the same preprocessing step.
pipelines = {
    'ridge': make_pipeline(preprocessing, Ridge()),
    'rf': make_pipeline(preprocessing, RandomForestRegressor()),
    'gb': make_pipeline(preprocessing, GradientBoostingRegressor()),
    # 'xg': make_pipeline(preprocessing, XGBRegressor()),
}


# Hyperparameter grids keyed by the same short names. Parameter names use
# make_pipeline's auto-generated lowercase step names (<step>__<param>).
grid = {
    'ridge': {'ridge__alpha': [0.05, 0.25, 0.5, 1.0]},
    'rf': {
        'randomforestregressor__n_estimators': [50, 100, 150],
        'randomforestregressor__max_depth': [5, 6, 7, None]
    },
    'gb': {
        'gradientboostingregressor__n_estimators': [50, 100, 150],
        'gradientboostingregressor__max_depth': [5, 6, 7, None]
    },
    # 'xg':{
    #    'xgbregressor__n_estimators':[50,100,150],
    #    'xgbregressor__max_depth':[5,6,7,None]
    # }
}


from sklearn.model_selection import GridSearchCV
import time

# Fit every candidate pipeline with a 10-fold grid search, timing each one.
# A failing model is reported and skipped so the remaining models still run.
total_start = time.monotonic()
fit_models = {}
for algo, pipeline in pipelines.items():
    try:
        print(algo)
        fit_start = time.monotonic()
        search = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10, scoring='r2')
        search.fit(X_train, y_train)
        fit_models[algo] = search
        print(algo, time.monotonic() - fit_start)
    except Exception as e:
        print(f'Model {algo} had an error {e}')

print(time.monotonic() - total_start)


# https://github.com/mkzia/eas503/blob/master/old/spring2024/week14/gcolab.ipynb
import os
from getpass import getpass

MLFLOW_TRACKING_URI = "https://dagshub.com/mkzia/house_models.mlflow"
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'mkzia')
# SECURITY FIX: never hardcode an access token in a notebook — it gets
# committed and leaked. Read it from the environment; prompt only if unset.
if 'MLFLOW_TRACKING_PASSWORD' not in os.environ:
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass("DagsHub MLflow token: ")


import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

# Create a new MLflow Experiment
mlflow.set_experiment("median_house_pricing")

# One MLflow run per fitted grid search: best hyperparameters, CV r2,
# and the model itself (registered under the algorithm's short name).
for algo, model in fit_models.items():
    score = model.best_score_
    params = model.best_params_
    with mlflow.start_run():
        # Log the hyperparameters
        mlflow.log_params(params)

        # Log metrics
        mlflow.log_metric("r2", score)
        # Infer the model signature from training inputs and predictions
        signature = infer_signature(X_train, model.best_estimator_.predict(X_train))

        # Log the model
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="housing_model",
            signature=signature,
            input_example=X_train,
            registered_model_name=algo,
        )
import joblib

# Save the fitted pipeline (preprocessing + random forest) to disk.
# NOTE: joblib pickles are tied to the library versions that created them;
# load with the same scikit-learn version.
joblib.dump(forest_reg, 'random_forest_model_v1.pkl')
['random_forest_model_v1.pkl']
# Evaluate the tuned pipeline on its own training data — an optimistic
# (lower-bound) RMSE; the held-out test-set number below is the honest one.
model = grid_search.best_estimator_
root_mean_squared_error(housing_labels, model.predict(housing))
array([438743.91630435, 457343.08      , 108316.        , ...,
       144973.0990991 , 489995.91      , 232980.        ])
np.float64(16012.022381795468)
# Final evaluation on the held-out stratified test set.
test = strat_test_set.drop("median_house_value", axis=1)
test_labels = strat_test_set["median_house_value"].copy()
root_mean_squared_error(test_labels, model.predict(test))
np.float64(43426.577409177575)
import joblib
# Reload the saved model and sanity-check that its training RMSE matches
# the in-memory model's value from earlier in the notebook.
final_model_reloaded = joblib.load("random_forest_model_v1.pkl")
root_mean_squared_error(housing_labels, final_model_reloaded.predict(housing))
np.float64(17547.52124624957)
# GET
# POST
!pip install dill
Collecting dill
  Using cached dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Using cached dill-0.3.9-py3-none-any.whl (119 kB)
Installing collected packages: dill
Successfully installed dill-0.3.9
import dill

# Serialize the fitted pipeline with dill (handles objects plain pickle
# sometimes cannot, e.g. lambdas inside custom transformers).
with open('rfr_v1.pkl', 'wb') as fh:
    dill.dump(forest_reg, fh)

# Reload and confirm the round-tripped model predicts identically
# (RMSE matches the in-memory model's training RMSE).
with open('rfr_v1.pkl', 'rb') as fh:
    reloaded_model = dill.load(fh)
root_mean_squared_error(housing_labels, reloaded_model.predict(housing))
np.float64(17547.52124624957)
housing
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity
13096 -122.42 37.80 52.0 3321.0 1115.0 1576.0 1034.0 2.0987 NEAR BAY
14973 -118.38 34.14 40.0 1965.0 354.0 666.0 357.0 6.0876 <1H OCEAN
3785 -121.98 38.36 33.0 1083.0 217.0 562.0 203.0 2.4330 INLAND
14689 -117.11 33.75 17.0 4174.0 851.0 1845.0 780.0 2.2618 INLAND
20507 -118.15 33.77 36.0 4366.0 1211.0 1912.0 1172.0 3.5292 NEAR OCEAN
... ... ... ... ... ... ... ... ... ...
14207 -118.40 33.86 41.0 2237.0 597.0 938.0 523.0 4.7105 <1H OCEAN
13105 -119.31 36.32 23.0 2945.0 592.0 1419.0 532.0 2.5733 INLAND
19301 -117.06 32.59 13.0 3920.0 775.0 2814.0 760.0 4.0616 NEAR OCEAN
19121 -118.40 34.06 37.0 3781.0 873.0 1725.0 838.0 4.1455 <1H OCEAN
19888 -122.41 37.66 44.0 431.0 195.0 682.0 212.0 3.2833 NEAR OCEAN

16512 rows × 9 columns

!pip install requests
Requirement already satisfied: requests in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (2.32.3)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from requests) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from requests) (3.8)
Requirement already satisfied: urllib3<3,>=1.21.1 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from requests) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in /home/mkzia/anaconda3/envs/eas503book/lib/python3.12/site-packages (from requests) (2024.7.4)
# Grab one training example to use as a sample payload for the prediction API.
row = housing.iloc[0]
row.to_dict()
{'longitude': -122.42,
 'latitude': 37.8,
 'housing_median_age': 52.0,
 'total_rooms': 3321.0,
 'total_bedrooms': 1115.0,
 'population': 1576.0,
 'households': 1034.0,
 'median_income': 2.0987,
 'ocean_proximity': 'NEAR BAY'}
import json
# Serialize the sample row as pretty-printed JSON (the request body format).
data = json.dumps(row.to_dict(), indent=2)
print(data)
{
  "longitude": -122.42,
  "latitude": 37.8,
  "housing_median_age": 52.0,
  "total_rooms": 3321.0,
  "total_bedrooms": 1115.0,
  "population": 1576.0,
  "households": 1034.0,
  "median_income": 2.0987,
  "ocean_proximity": "NEAR BAY"
}
import requests
import json

# Sample record matching the model's expected input schema.
data = {
  "longitude": -122.42,
  "latitude": 37.8,
  "housing_median_age": 52.0,
  "total_rooms": 3321.0,
  "total_bedrooms": 1115.0,
  "population": 1576.0,
  "households": 1034.0,
  "median_income": 2.0987,
  "ocean_proximity": "NEAR BAY"
}

print(json.dumps(data, indent=2))
# FIX: pass the dict via `json=` so requests serializes it AND sets the
# Content-Type: application/json header. The previous version posted a
# pre-encoded string via `data=` with no content type, which is a common
# cause of server-side 500s (and the traceback below shows the 500 body
# was not JSON, so r.json() crashed).
r = requests.post('http://159.65.246.99:8002/predict', json=data, timeout=10)
r.raise_for_status()  # fail with a clear HTTPError instead of inside r.json()
r.json()
{
  "longitude": -122.42,
  "latitude": 37.8,
  "housing_median_age": 52.0,
  "total_rooms": 3321.0,
  "total_bedrooms": 1115.0,
  "population": 1576.0,
  "households": 1034.0,
  "median_income": 2.0987,
  "ocean_proximity": "NEAR BAY"
}
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
File ~/anaconda3/envs/eas503book/lib/python3.12/site-packages/requests/models.py:974, in Response.json(self, **kwargs)
    973 try:
--> 974     return complexjson.loads(self.text, **kwargs)
    975 except JSONDecodeError as e:
    976     # Catch JSON-related errors and raise as requests.JSONDecodeError
    977     # This aliases json.JSONDecodeError and simplejson.JSONDecodeError

File ~/anaconda3/envs/eas503book/lib/python3.12/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:

File ~/anaconda3/envs/eas503book/lib/python3.12/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
    333 """Return the Python representation of ``s`` (a ``str`` instance
    334 containing a JSON document).
    335 
    336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338 end = _w(s, end).end()

File ~/anaconda3/envs/eas503book/lib/python3.12/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
    354 except StopIteration as err:
--> 355     raise JSONDecodeError("Expecting value", s, err.value) from None
    356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

JSONDecodeError                           Traceback (most recent call last)
Cell In[2], line 18
     16 print(data)
     17 r = requests.post('http://159.65.246.99:8002/predict', data=data)
---> 18 r.json()

File ~/anaconda3/envs/eas503book/lib/python3.12/site-packages/requests/models.py:978, in Response.json(self, **kwargs)
    974     return complexjson.loads(self.text, **kwargs)
    975 except JSONDecodeError as e:
    976     # Catch JSON-related errors and raise as requests.JSONDecodeError
    977     # This aliases json.JSONDecodeError and simplejson.JSONDecodeError
--> 978     raise RequestsJSONDecodeError(e.msg, e.doc, e.pos)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)
r
<Response [500]>
housing_labels.iloc[3]
np.float64(96100.0)
import math
import json
from collections import defaultdict

# Reload the raw dataset so the UI bounds reflect the full (unsplit) data.
# (typo fix: housing_orignal -> housing_original)
housing_original = pd.read_csv("./housing.csv")  # https://github.com/ageron/data/tree/main/housing

slider_fields = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

single_select_fields = ["ocean_proximity"]

streamlit_field_data = defaultdict(dict)

# Slider bounds per numeric field: [floor(min), ceil(max)].
for field in slider_fields:
    streamlit_field_data["slider_fields"][field] = [
        math.floor(housing_original[field].min()),
        math.ceil(housing_original[field].max()),
    ]

# Dropdown options per categorical field.
# BUG FIX: use the loop variable `field` instead of the hardcoded
# "ocean_proximity" column, so any field added to single_select_fields
# gets its own options.
for field in single_select_fields:
    streamlit_field_data["single_select_fields"][field] = sorted(
        housing_original[field].unique()
    )

# Write the options for the Streamlit app; the context manager guarantees
# the file is flushed and closed (the old open(...) call leaked the handle).
with open("streamlit_options.json", "w") as fh:
    json.dump(streamlit_field_data, fh, indent=2)